{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "2d87ad23-587a-4b20-8121-1d1748ac301a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting transformers\n",
      "  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m50.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting accelerate\n",
      "  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.6/227.6 kB\u001b[0m \u001b[31m47.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.9.0)\n",
      "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n",
      "  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m59.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.24.1)\n",
      "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n",
      "Collecting regex!=2019.12.17 (from transformers)\n",
      "  Downloading regex-2023.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.4/770.4 kB\u001b[0m \u001b[31m50.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.28.1)\n",
      "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n",
      "  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m99.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n",
      "  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m111.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting tqdm>=4.27 (from transformers)\n",
      "  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.1/77.1 kB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n",
      "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.0.1+cu117)\n",
      "Collecting fsspec (from huggingface-hub<1.0,>=0.14.1->transformers)\n",
      "  Downloading fsspec-2023.6.0-py3-none-any.whl (163 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.8/163.8 kB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.4.0)\n",
      "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (1.11.1)\n",
      "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (3.0)\n",
      "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (3.1.2)\n",
      "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (2.0.0)\n",
      "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.6.0->accelerate) (3.25.0)\n",
      "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.6.0->accelerate) (15.0.7)\n",
      "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.1.1)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.13)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->accelerate) (2.1.2)\n",
      "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->accelerate) (1.2.1)\n",
      "Installing collected packages: tokenizers, safetensors, tqdm, regex, fsspec, huggingface-hub, transformers, accelerate\n",
      "Successfully installed accelerate-0.20.3 fsspec-2023.6.0 huggingface-hub-0.15.1 regex-2023.6.3 safetensors-0.3.1 tokenizers-0.13.3 tqdm-4.65.0 transformers-4.30.2\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "!pip install transformers accelerate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "52e4776c-8820-4ee6-9ae4-9db51e2ed365",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "device(type='cuda', index=0)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import requests\n",
    "from PIL import Image\n",
    "from transformers import Blip2Processor, Blip2ForConditionalGeneration\n",
    "import torch\n",
    "import os\n",
    "\n",
    "device = torch.device(\"cuda\", 0)\n",
    "device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e4ad6102-160e-487d-99c0-da50a52a5e4e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6b01bf8e2d2a4680ba09d412a2a0286d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)rocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d927a13d206a467388e7afbd449b7238",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)okenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9567eaeb793c4ab1875049fc2e0c2375",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "047288537e9d4f989e238c1e7789767a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8b31492abb98403c96b92a2a06ddd709",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2f17a1a3b4fd4059beefd3abb3b53184",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)cial_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "62da54d46d4546a28df4e43f3ec1696b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)lve/main/config.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "07e7b68353da4f1ea57a5563b6aaa5f7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)model.bin.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "db9254ad28eb424088dae1d4639ca28b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3466cdec205f459f8c4aacf2b0d5fb3f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "00ee3c753f444d93b07969cadb5a8d99",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.50G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4a411c6523fc49c492374747307eee1f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "processor = Blip2Processor.from_pretrained(\"Salesforce/blip2-opt-2.7b\")\n",
    "model = Blip2ForConditionalGeneration.from_pretrained(\"Salesforce/blip2-opt-2.7b\", torch_dtype=torch.float16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2d87ea9b-a43c-4585-965c-03b3919cceaf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Blip2ForConditionalGeneration(\n",
       "  (vision_model): Blip2VisionModel(\n",
       "    (embeddings): Blip2VisionEmbeddings(\n",
       "      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))\n",
       "    )\n",
       "    (encoder): Blip2Encoder(\n",
       "      (layers): ModuleList(\n",
       "        (0-38): 39 x Blip2EncoderLayer(\n",
       "          (self_attn): Blip2Attention(\n",
       "            (dropout): Dropout(p=0.0, inplace=False)\n",
       "            (qkv): Linear(in_features=1408, out_features=4224, bias=True)\n",
       "            (projection): Linear(in_features=1408, out_features=1408, bias=True)\n",
       "          )\n",
       "          (layer_norm1): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "          (mlp): Blip2MLP(\n",
       "            (activation_fn): GELUActivation()\n",
       "            (fc1): Linear(in_features=1408, out_features=6144, bias=True)\n",
       "            (fc2): Linear(in_features=6144, out_features=1408, bias=True)\n",
       "          )\n",
       "          (layer_norm2): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (post_layernorm): LayerNorm((1408,), eps=1e-05, elementwise_affine=True)\n",
       "  )\n",
       "  (qformer): Blip2QFormerModel(\n",
       "    (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "    (dropout): Dropout(p=0.1, inplace=False)\n",
       "    (encoder): Blip2QFormerEncoder(\n",
       "      (layer): ModuleList(\n",
       "        (0): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (crossattention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (1): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (2): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (crossattention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (3): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (4): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (crossattention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (5): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (6): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (crossattention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (7): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (8): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (crossattention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (9): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (10): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (crossattention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=1408, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "        (11): Blip2QFormerLayer(\n",
       "          (attention): Blip2QFormerAttention(\n",
       "            (attention): Blip2QFormerMultiHeadAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): Blip2QFormerSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate_query): Blip2QFormerIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output_query): Blip2QFormerOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "  )\n",
       "  (language_projection): Linear(in_features=768, out_features=2560, bias=True)\n",
       "  (language_model): OPTForCausalLM(\n",
       "    (model): OPTModel(\n",
       "      (decoder): OPTDecoder(\n",
       "        (embed_tokens): Embedding(50272, 2560, padding_idx=1)\n",
       "        (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)\n",
       "        (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
       "        (layers): ModuleList(\n",
       "          (0-31): 32 x OPTDecoderLayer(\n",
       "            (self_attn): OPTAttention(\n",
       "              (k_proj): Linear(in_features=2560, out_features=2560, bias=True)\n",
       "              (v_proj): Linear(in_features=2560, out_features=2560, bias=True)\n",
       "              (q_proj): Linear(in_features=2560, out_features=2560, bias=True)\n",
       "              (out_proj): Linear(in_features=2560, out_features=2560, bias=True)\n",
       "            )\n",
       "            (activation_fn): ReLU()\n",
       "            (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
       "            (fc1): Linear(in_features=2560, out_features=10240, bias=True)\n",
       "            (fc2): Linear(in_features=10240, out_features=2560, bias=True)\n",
       "            (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (lm_head): Linear(in_features=2560, out_features=50272, bias=False)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "458a2709-b904-49af-8f10-41905e1cfdc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.parse as parse\n",
    "import os\n",
    "\n",
    "# a function to determine whether a string is a URL or not\n",
    "def is_url(string):\n",
    "    try:\n",
    "        result = parse.urlparse(string)\n",
    "        return all([result.scheme, result.netloc, result.path])\n",
    "    except:\n",
    "        return False\n",
    "    \n",
    "# a function to load an image\n",
    "def load_image(image_path):\n",
    "    if is_url(image_path):\n",
    "        return Image.open(requests.get(image_path, stream=True).raw)\n",
    "    elif os.path.exists(image_path):\n",
    "        return Image.open(image_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "af353956-7f42-43b3-bd5a-c720078e8a65",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_image = load_image(\"http://images.cocodataset.org/test-stuff2017/000000007226.jpg\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "bce7e019-d042-4f3d-9fc0-32617257f03c",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"a\"\n",
    "inputs = processor(raw_image, question, return_tensors=\"pt\").to(device, dtype=torch.float16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "8d989e92-71ed-438d-9150-31589ba00fb1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " vintage car driving down a street\n",
      "\n"
     ]
    }
   ],
   "source": [
    "out = model.generate(**inputs)\n",
    "print(processor.decode(out[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "d27e36e1-14bc-4535-9397-d716458594ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"a vintage car driving down a street\"\n",
    "inputs = processor(raw_image, question, return_tensors=\"pt\").to(device, dtype=torch.float16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "ebeea2b5-7b4d-4ef4-a2dc-c06876897361",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " with a man in the back seat\n",
      "\n"
     ]
    }
   ],
   "source": [
    "out = model.generate(**inputs)\n",
    "print(processor.decode(out[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "b095054a-f62e-4b2e-b3af-6a5d69dae581",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"Question: What is the estimated year of these cars? Answer:\"\n",
    "inputs = processor(raw_image, question, return_tensors=\"pt\").to(device, dtype=torch.float16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "ebd05f34-0d2e-46bd-a742-aca57138fb54",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " The cars are from the early 1900's\n",
      "\n"
     ]
    }
   ],
   "source": [
    "out = model.generate(**inputs)\n",
    "print(processor.decode(out[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "7f16721e-cc71-4c5f-b352-920381177b06",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"Question: What is the color of the car? Answer:\"\n",
    "inputs = processor(raw_image, question, return_tensors=\"pt\").to(device, dtype=torch.float16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "4e49e1aa-6260-49a6-a7ed-67e356591948",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Green\n",
      "\n"
     ]
    }
   ],
   "source": [
    "out = model.generate(**inputs)\n",
    "print(processor.decode(out[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "373c0776-1c53-467a-b9c4-afdc71702ef2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
